# Using the data collected from existing customers, build a model that helps
# the marketing team identify potential customers who are relatively more
# likely to subscribe to a term deposit, and thus increase their hit ratio.
# Data file: bank-full.csv
# Input variables: bank client data
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix
%matplotlib inline
#from yellowbrick.classifier import ClassificationReport, ROCAUC
# Notebook display configuration.
plt.style.use('ggplot')
pd.options.display.float_format = '{:,.2f}'.format
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
from sklearn.feature_extraction.text import CountVectorizer  # DT does not take strings as input for the model fit step....

# Load the bank marketing dataset and run a first-pass inspection.
bank_df = pd.read_csv("bank-full.csv")  # import .csv file
bank_df.head(10)
bank_df.shape
bank_df.info()
bank_df.isnull().sum()  # check for null values -- there are none

# Frequency table for every categorical column, in the original order.
for col in ['job', 'marital', 'education', 'default', 'housing',
            'loan', 'contact', 'month', 'poutcome', 'Target']:
    print(bank_df[col].value_counts())

bank_df.describe()  # descriptive statistics of every numeric column
# Observations from describe():
# 1) balance has a negative minimum -> some accounts owe money to the bank
# pdays: number of days since the client was last contacted in a previous
# campaign (-1 means never contacted, or contact beyond ~900 days ago)
# pdays is -1 at the min and at the 25th/50th/75th percentiles, so the
# majority of clients were never contacted -- these are not outliers.
# This is analysed further below with distplot, countplot and box plots.
bank_df.describe().transpose()
# Correlation heatmap between the numeric variables.
# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.corr()
# raises a TypeError while object columns (job, marital, ...) are still
# present; select_dtypes works identically on older pandas as well.
plt.figure(figsize=(10, 8))
sns.heatmap(bank_df.select_dtypes(include=np.number).corr(), annot=True, fmt=".2")
plt.show()

# Outlier check: box plot for every continuous column.
# job, marital, education, default, housing, loan, contact, month, poutcome
# and Target are object dtype and cannot be box-plotted, so only the numeric
# columns are drawn.
for col in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']:
    sns.boxplot(bank_df[col])
    plt.show()
# Distribution plot for each continuous column.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is its direct replacement (histogram + KDE).
# Object columns (job, marital, education, ...) are excluded -- they
# cannot be plotted as distributions.
for col in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']:
    sns.histplot(bank_df[col], kde=True)
    plt.show()
# Count plot for each continuous column.
# The object-dtype columns (job, marital, education, default, housing, loan,
# contact, month, poutcome, Target) are excluded; only numeric columns plot.
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
for column in numeric_cols:
    sns.countplot(bank_df[column])
    plt.show()
# Box plots for the continuous columns (repeated view).
# Object-dtype columns such as job, marital, education, ... cannot be
# box-plotted, so the loop covers only the numeric ones.
continuous = ('age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous')
for column in continuous:
    sns.boxplot(bank_df[column])
    plt.show()
# Levels of the duration variable.
bank_df['duration'].value_counts()

# Bivariate views.  seaborn >= 0.12 made jointplot's data arguments
# keyword-only, so the original positional calls raise a TypeError there;
# x=/y= keywords work on every seaborn version.
sns.jointplot(x=bank_df['age'], y=bank_df['balance'])  # also shows a few -ve balances
sns.jointplot(x=bank_df['age'], y=bank_df['duration'])
bank_df.head()

# Regression views of balance vs age, split by marital status and by
# defaulter status.
sns.lmplot(x="age", y="balance", hue="marital", data=bank_df)
sns.lmplot(x="age", y="balance", hue="default", data=bank_df)

# Pair plots coloured by age, marital status and education.
sns.pairplot(bank_df, hue='age')
sns.pairplot(bank_df, hue='marital')
sns.pairplot(bank_df, hue='education')
bank_df.head(10)
# Encode the categoricals: binary yes/no columns become 0/1 via replace,
# and the multi-level columns are one-hot encoded with get_dummies so the
# models receive purely numeric input.
binary_map = {"no": 0, "yes": 1}
replaceStruct = {col: binary_map for col in ("default", "housing", "loan", "Target")}
oneHotCols = ["job", "marital", "education", "contact", "month", "poutcome"]

bank_df = bank_df.replace(replaceStruct)
bank_df = pd.get_dummies(bank_df, columns=oneHotCols)
bank_df.head(10)
bank_df.info()  # verify every object/categorical column is now int64/uint8
# Correlation heatmap of the fully-encoded frame.
plt.figure(figsize=(40, 30))
sns.heatmap(
    bank_df.corr(),
    annot=True,
    linewidths=.5,
    center=0,
    cbar=False,
    cmap="YlGnBu",
)
plt.show()

# Same correlations again, with 2-significant-digit annotations.
plt.figure(figsize=(40, 30))
sns.heatmap(bank_df.corr(), annot=True, fmt=".2")
plt.show()

bank_df.columns

# Per-class column means, grouped by Target / loan / default, to see how
# the data is distributed across each class.
bank_df.groupby(["Target"]).mean()
bank_df.groupby(["loan"]).mean()
bank_df.groupby(["default"]).mean()
## Define X and Y variables.
# The project objective (see the file header and the comments above) is to
# predict term-deposit subscription, i.e. the 'Target' column.  The original
# code mistakenly trained against 'default' instead.
X = bank_df.drop('Target', axis=1)
Y = bank_df[['Target']]

# Split into training and test sets (70/30, fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=7)
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression fitted on the training split.
logreg = LogisticRegression(random_state=7)
logreg.fit(X_train, y_train)
y_predict = logreg.predict(X_test)  # predict the target on held-out data

# Put observed and predicted classes side by side for quick inspection.
z = X_test.copy()
z['Observed'] = y_test
z['Predicted'] = y_predict
z.head()
## Helper: render a confusion matrix as an annotated heatmap.
def draw_cm(actual, predicted):
    """Plot the confusion matrix of *actual* vs *predicted* 0/1 labels.

    Cells are annotated with integer counts ('d' format -- the original
    '.2f' rendered counts like '1234.00').  Shows the figure; returns None.
    """
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Evaluate the baseline model on both splits.
print("Training accuracy", logreg.score(X_train, y_train))
print()
print("Testing accuracy", logreg.score(X_test, y_test))
print()
print('Confusion Matrix')
# draw_cm plots and returns None -- wrapping it in print() only emitted a
# spurious "None" line, so it is called directly.
draw_cm(y_test, y_predict)
print()
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# ROC curve for the baseline logistic regression.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:, 1])

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

# Inspect the hyper-parameters of the fitted logistic regression.
logreg.get_params()
# Sweep the 'solver' hyper-parameter with the l2 penalty.
# Every solver supports l2; only 'liblinear' and 'saga' also support l1.
train_score = []
test_score = []
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
for name in solver:
    model = LogisticRegression(random_state=42, penalty='l2', C=0.75, solver=name)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    train_score.append(round(model.score(X_train, y_train), 3))
    test_score.append(round(model.score(X_test, y_test), 3))

print(solver)
print()
print(train_score)
print()
print(test_score)
# Repeat the solver sweep with the l1 penalty -- only these two solvers
# support it.
train_score = []
test_score = []
solver = ['liblinear', 'saga']
for name in solver:
    model = LogisticRegression(random_state=42, penalty='l1', C=0.75, solver=name)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    train_score.append(round(model.score(X_train, y_train), 3))
    test_score.append(round(model.score(X_test, y_test), 3))

print(solver)
print()
print(train_score)
print()
print(test_score)
# Refit with class_weight='balanced' to offset the skewed class distribution.
model = LogisticRegression(random_state=42, penalty='l1', solver='liblinear',
                           class_weight='balanced')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Training accuracy", model.score(X_train, y_train))  # fixed "Trainig" typo in the label
print()
print("Testing accuracy", model.score(X_test, y_test))
# Sweep the inverse-regularisation strength C for the balanced l1 model.
train_score = []
test_score = []
C = [0.01, 0.1, 0.25, 0.5, 0.75, 1]
for c in C:
    model = LogisticRegression(random_state=42, penalty='l1', solver='liblinear',
                               class_weight='balanced', C=c)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # Collect train/test accuracy for this C so the sweep can be compared.
    train_score.append(round(model.score(X_train, y_train), 3))
    test_score.append(round(model.score(X_test, y_test), 3))

print(C)
print()
print(train_score)
print()
print(test_score)
# Final tuned model: l1 penalty, liblinear solver, balanced classes, C=0.5.
model = LogisticRegression(random_state=42, penalty='l1', solver='liblinear',
                           class_weight='balanced', C=0.5)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Training accuracy", model.score(X_train, y_train))
print()
print("Testing accuracy", model.score(X_test, y_test))
print()
print('Confusion Matrix')
# draw_cm plots and returns None -- the original print(draw_cm(...)) call
# emitted a spurious "None" line.
draw_cm(y_test, y_predict)
print()
print("Recall:", recall_score(y_test, y_predict))
print()
print("Precision:", precision_score(y_test, y_predict))
print()
print("F1 Score:", f1_score(y_test, y_predict))
print()
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# Visualise the final model's performance with the yellowbrick library.
from yellowbrick.classifier import ClassificationReport, ROCAUC

viz = ClassificationReport(model)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

roc = ROCAUC(model)
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function
from sklearn.model_selection import train_test_split
import numpy as np
import os,sys
from scipy import stats
# calculate accuracy measures and confusion matrix
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm #Build the logistic regression model
logit = sm.Logit(y_train, sm.add_constant(X_train))
lg = logit.fit()
from scipy import stats #Summary of logistic regression
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
print(lg.summary())
#Calculate Odds Ratio, probability ##create a data frame to collate Odds ratio, probability and p-value of the coef
lgcoef = pd.DataFrame(lg.params, columns=['coef'])
lgcoef.loc[:, "Odds_ratio"] = np.exp(lgcoef.coef)
lgcoef['probability'] = lgcoef['Odds_ratio']/(1+lgcoef['Odds_ratio'])
lgcoef['pval']=lg.pvalues
pd.options.display.float_format = '{:.2f}'.format
# FIlter by significant p-value (pval <0.1) and sort descending by Odds ratio
lgcoef = lgcoef.sort_values(by="Odds_ratio", ascending=False)
pval_filter = lgcoef['pval']<=0.1
lgcoef[pval_filter]
# Build a decision tree using the default 'gini' split criterion.
# `tree` was never imported (only DecisionTreeClassifier was), so the
# tree.export_text / tree.export_graphviz calls below raised a NameError.
from sklearn import tree

dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
dTree.fit(X_train, y_train)

# Score on both splits (an unpruned tree will badly overfit the train set).
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))

# Plain-text dump of the fitted tree.
text_representation = tree.export_text(dTree)
print(text_representation)

# Export a .dot file for rendering with graphviz; the with-block guarantees
# the file handle is closed even if export_graphviz raises.
train_char_label = ['No', 'Yes']
with open('dTree.dot', 'w') as dTree_File:
    dot_data = tree.export_graphviz(dTree, out_file=dTree_File,
                                    feature_names=list(X_train),
                                    class_names=list(train_char_label))
# Regularised tree: cap depth at 3 to curb the overfitting seen above.
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))
print(dTreeR.score(X_test, y_test))

# Feature importances: the (normalised) total reduction of the split
# criterion contributed by each feature, a.k.a. Gini importance.
print(pd.DataFrame(dTreeR.feature_importances_, columns=["Imp"], index=X_train.columns))
print(dTreeR.score(X_test, y_test))

# Confusion matrix of the pruned tree on the test split.
y_predict = dTreeR.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
from sklearn.tree import export_graphviz
# sklearn.externals.six was removed in scikit-learn 0.23; io.StringIO is the
# drop-in stdlib replacement for the StringIO it vendored.
from io import StringIO
from IPython.display import Image
import pydotplus
import graphviz
from sklearn.tree import DecisionTreeClassifier

# The graphviz rendering below was left disabled by the author; kept here
# (commented) so it can be re-enabled once `dot` is installed.
#BankData_Tree_File = open('bankdatadf.dot','w')  ## Creating the dot file for generating the Decision Tree diagram
#export_graphviz(dTreeR, out_file=BankData_Tree_File, filled=True, rounded=True, special_characters=True,feature_names = filtered_columns)
#BankData_Tree_File.close()
#import os
#retCode = os.system("dot -Tpng bankdatadf.dot -o bankdatadf.png")
#if(retCode>0):
#    print("system command returning error: "+str(retCode))
#else:
#    display(Image("bankdatadf.png"))

# Feature-importance matrix for the pruned decision tree.
print(pd.DataFrame(dTreeR.feature_importances_, columns=["Imp"], index=X_train.columns))
# Ensemble models on the same train/test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Random forest of 50 trees.
randomClassifier = RandomForestClassifier(n_estimators=50, random_state=40)
randomClassifier.fit(X_train, y_train)
y_predict = randomClassifier.predict(X_test)
randomClassifier.score(X_test, y_test)

# Bagging 50 copies of the pruned tree.
# NOTE(review): base_estimator was renamed to `estimator` in sklearn 1.2 and
# removed in 1.4 -- update this keyword when upgrading scikit-learn.
baggingClassifier = BaggingClassifier(base_estimator=dTreeR, n_estimators=50, random_state=40)
baggingClassifier.fit(X_train, y_train)
y_predict = baggingClassifier.predict(X_test)
baggingClassifier.score(X_test, y_test)

# AdaBoost with its default base learner.
# With base_estimator=dTree the score was 0.8782070185785904; without an
# explicit estimator the score improved to 0.897670303745208.
adaClassifier = AdaBoostClassifier(n_estimators=50, random_state=40)
adaClassifier.fit(X_train, y_train)
y_predict = adaClassifier.predict(X_test)
adaClassifier.score(X_test, y_test)